The purpose of this notebook is to understand and visualize how various author metrics are distributed and how they correlate with the mentorship score. The resulting insights may be useful for the paper.
import networkx as nx
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import joblib
import argparse
from multiprocessing import Pool
import logging
import warnings
import glob
from joblib import Parallel, delayed
import collections
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import ast
from numpy import arange
from IPython.display import Image
from matplotlib.pyplot import figure
import plotly.express as px
# Configure the root logger so that records at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)
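The data is a directed graph over mentor and mentee authors. Each edge goes from a senior author (mentor) to a junior author (mentee) and carries the mentorship_score for that pair (pred_prob). A mentor has out edges to each mentee.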
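mentorship_score: the mentor's aggregate mentorship score (it appears to be the sum of the pair scores over the mentor's mentees — in the table below it equals mentorship_score_mean × mentee_count)
mentorship_score_mean: mentorship_score divided by mentee_count
mentorship_score_median: presumably the median of the pair scores over the mentor's mentees (not among the columns loaded below)
mentee_count: number of mentees of the author
Field of study (fos): the author's field of study (e.g. Physics, Biology, Computer Science)
copub_start and copub_end: copublication start and end dates
h_index: the h-index of the mentor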
# The input CSV can be downloaded with the (commented-out) notebook shell command below.
#!aws s3 cp --no-sign-request s3://ai2-s2-research-public/s2amp/inferred/mentors_s2_fos_scores.csv data/inferred/mentors_s2_fos_scores.csv
mentors_csv_path = 'data/inferred/mentors_s2_fos_scores.csv'
# Load the per-author table; the bare name on the last line renders it as the cell output.
df_mentors = pd.read_csv(filepath_or_buffer=mentors_csv_path)
df_mentors
| authors_ai2_id | h_index | paper_count | citation_count | affiliations | mentorship_score | mentorship_score_mean | menteeship_score | menteeship_score_mean | mentee_count | mentor_count | fos | log_mentee_count | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2562240 | 31 | 347 | 3730 | NaN | 63.163338 | 0.371549 | 0.947075 | 0.094707 | 170 | 10 | Physics | 7.409391 |
| 1 | 20772242 | 3 | 22 | 48 | ["University of Kashmir"] | 0.484438 | 0.242219 | 2.146073 | 0.536518 | 2 | 4 | Biology | 1.000000 |
| 2 | 6402240 | 8 | 12 | 172 | NaN | 0.046127 | 0.046127 | 2.988159 | 0.747040 | 1 | 4 | Biology | 0.000000 |
| 3 | 48852240 | 6 | 27 | 62 | NaN | 3.916424 | 0.652737 | 0.783597 | 0.783597 | 6 | 1 | Chemistry | 2.584963 |
| 4 | 15018242 | 4 | 45 | 69 | NaN | 0.835011 | 0.139169 | 1.210539 | 0.201757 | 6 | 6 | Education | 2.584963 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9371015 | 2118403769 | 5 | 20 | 191 | NaN | 0.458033 | 0.229016 | 1.508733 | 0.215533 | 2 | 7 | Computer Science | 1.000000 |
| 9371016 | 2110679076 | 13 | 39 | 1063 | ["Novartis"] | 1.726746 | 0.345349 | 4.071905 | 0.290850 | 5 | 14 | Computer Science | 2.321928 |
| 9371017 | 2111009961 | 2 | 6 | 21 | NaN | 0.280124 | 0.280124 | 1.832399 | 0.458100 | 1 | 4 | Computer Science | 0.000000 |
| 9371018 | 2109738049 | 15 | 33 | 1071 | NaN | 0.090947 | 0.006996 | 2.448788 | 0.204066 | 13 | 12 | Computer Science | 3.700440 |
| 9371019 | 2115257611 | 2 | 5 | 25 | NaN | 0.071483 | 0.023828 | 2.634749 | 0.526950 | 3 | 5 | Computer Science | 1.584963 |
9371020 rows × 13 columns
# Add a base-2 log column (named log_<column>) for each of the columns below.
# log_mentee_count is already present in the loaded table and is overwritten in place;
# the other two are appended as new columns.
for source_col in ('paper_count', 'mentee_count', 'mentorship_score'):
    df_mentors['log_' + source_col] = np.log2(df_mentors[source_col])
# Render the updated frame as the cell output.
df_mentors
| authors_ai2_id | h_index | paper_count | citation_count | affiliations | mentorship_score | mentorship_score_mean | menteeship_score | menteeship_score_mean | mentee_count | mentor_count | fos | log_mentee_count | log_paper_count | log_mentorship_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2562240 | 31 | 347 | 3730 | NaN | 63.163338 | 0.371549 | 0.947075 | 0.094707 | 170 | 10 | Physics | 7.409391 | 8.438792 | 5.981016 |
| 1 | 20772242 | 3 | 22 | 48 | ["University of Kashmir"] | 0.484438 | 0.242219 | 2.146073 | 0.536518 | 2 | 4 | Biology | 1.000000 | 4.459432 | -1.045616 |
| 2 | 6402240 | 8 | 12 | 172 | NaN | 0.046127 | 0.046127 | 2.988159 | 0.747040 | 1 | 4 | Biology | 0.000000 | 3.584963 | -4.438240 |
| 3 | 48852240 | 6 | 27 | 62 | NaN | 3.916424 | 0.652737 | 0.783597 | 0.783597 | 6 | 1 | Chemistry | 2.584963 | 4.754888 | 1.969537 |
| 4 | 15018242 | 4 | 45 | 69 | NaN | 0.835011 | 0.139169 | 1.210539 | 0.201757 | 6 | 6 | Education | 2.584963 | 5.491853 | -0.260132 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9371015 | 2118403769 | 5 | 20 | 191 | NaN | 0.458033 | 0.229016 | 1.508733 | 0.215533 | 2 | 7 | Computer Science | 1.000000 | 4.321928 | -1.126478 |
| 9371016 | 2110679076 | 13 | 39 | 1063 | ["Novartis"] | 1.726746 | 0.345349 | 4.071905 | 0.290850 | 5 | 14 | Computer Science | 2.321928 | 5.285402 | 0.788056 |
| 9371017 | 2111009961 | 2 | 6 | 21 | NaN | 0.280124 | 0.280124 | 1.832399 | 0.458100 | 1 | 4 | Computer Science | 0.000000 | 2.584963 | -1.835860 |
| 9371018 | 2109738049 | 15 | 33 | 1071 | NaN | 0.090947 | 0.006996 | 2.448788 | 0.204066 | 13 | 12 | Computer Science | 3.700440 | 5.044394 | -3.458831 |
| 9371019 | 2115257611 | 2 | 5 | 25 | NaN | 0.071483 | 0.023828 | 2.634749 | 0.526950 | 3 | 5 | Computer Science | 1.584963 | 2.321928 | -3.806263 |
9371020 rows × 15 columns
# Preview a random sample of 100,000 rows; the fixed random_state makes the draw reproducible.
df_mentors.sample(n=100000, random_state=2)
| authors_ai2_id | h_index | paper_count | citation_count | affiliations | mentorship_score | mentorship_score_mean | menteeship_score | menteeship_score_mean | mentee_count | mentor_count | fos | log_mentee_count | log_paper_count | log_mentorship_score | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 8198406 | 144906163 | 6 | 82 | 187 | ["Northeast Normal University"] | 2.808951 | 0.280895 | 1.431177 | 0.477059 | 10 | 3 | Agricultural And Food Sciences | 3.321928 | 6.357552 | 1.490032 |
| 905695 | 46578846 | 8 | 34 | 552 | NaN | 2.384189 | 0.340598 | 0.274725 | 0.274725 | 7 | 1 | Medicine | 2.807355 | 5.087463 | 1.253498 |
| 1757135 | 31904437 | 5 | 16 | 64 | NaN | 0.408619 | 0.051077 | 2.043795 | 0.340632 | 8 | 6 | Medicine | 3.000000 | 4.000000 | -1.291172 |
| 4673111 | 6995996 | 14 | 28 | 743 | ["Blaise Pascal University"] | 0.089003 | 0.044502 | 5.778299 | 0.481525 | 2 | 12 | Materials Science | 1.000000 | 4.807355 | -3.490001 |
| 6261870 | 88229109 | 2 | 7 | 8 | NaN | 0.017446 | 0.017446 | 1.038826 | 0.519413 | 1 | 2 | Environmental Science | 0.000000 | 2.807355 | -5.840941 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 6416861 | 93110985 | 0 | 1 | 0 | NaN | 0.111461 | 0.037154 | 1.684144 | 0.842072 | 3 | 2 | Chemistry | 1.584963 | 0.000000 | -3.165387 |
| 1856281 | 4832073 | 21 | 48 | 1486 | ["Institute of Biochemistry and Biophysics, Po... | 15.526601 | 0.443617 | 2.243746 | 0.373958 | 35 | 6 | Biology | 5.129283 | 5.584963 | 3.956670 |
| 161333 | 6042358 | 1 | 5 | 4 | ["Laboratory of Swimming and Water Lifesaving,... | 0.186727 | 0.093363 | 1.364832 | 0.682416 | 2 | 2 | Medicine | 1.000000 | 2.321928 | -2.420999 |
| 5954815 | 87225716 | 1 | 6 | 4 | NaN | 0.162221 | 0.162221 | 0.596870 | 0.119374 | 1 | 5 | Medicine | 0.000000 | 2.584963 | -2.623965 |
| 4808117 | 51997260 | 1 | 36 | 1 | NaN | 11.232941 | 0.702059 | 1.302460 | 0.325615 | 16 | 4 | Biology | 4.000000 | 5.169925 | 3.489664 |
100000 rows × 15 columns
Here we try to find the actual profiles of the mentors that our model assigns very high scores.
We look at the graph below (colored by field of study) and search for the corresponding real authors -
# Draw a reproducible 100,000-row sample for plotting.
# NOTE(review): dropna() without arguments removes every row that has a NaN in ANY column
# (including `affiliations`, which is NaN for many rows in the table above), so only fully
# populated rows are plotted — confirm this is intended.
scatter_sample = df_mentors.sample(n=100000, random_state=7).dropna()

# Mentorship score vs. log2 mentee count, colored by field of study; the author id is
# included in the hover tooltip so individual points can be looked up.
fig = px.scatter(
    scatter_sample,
    x="mentorship_score",
    y="log_mentee_count",
    color="fos",
    hover_data=['authors_ai2_id'],
)
fig.show()
Author profile (Chemistry)
"Paquette had authored more than 1000 papers, 38 book chapters, and 17 books, and had guided approximately 150 graduate students to their Ph.D. degrees." - Wikipedia
# Display the local image file chemistry.png inline (presumably a screenshot of the author's profile — verify).
Image(filename = "chemistry.png")
Author profile (Medicine)
"He led a team identifying the SARS coronavirus that caused the SARS pandemic of 2003–4, and traced its genetic origins to wild bats. During the ongoing COVID-19 pandemic, he has acted as expert adviser to the Hong Kong government." - Wikipedia
# Draw a reproducible 100,000-row sample, then keep only the Medicine authors.
# The filter mask is built from the sample itself: indexing the sample with a mask computed
# on the full `df_mentors` has a mismatched index and makes pandas emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
medicine_sample = df_mentors.sample(n=100000, random_state=2)
medicine_sample = medicine_sample[medicine_sample['fos'] == 'Medicine']

# Mentorship score vs. h-index for the sampled Medicine authors; the author id is included
# in the hover tooltip so individual points can be looked up.
fig = px.scatter(
    medicine_sample,
    x="mentorship_score",
    y="h_index",
    color="fos",
    hover_data=['authors_ai2_id'],
)
fig.show()
/net/nfs2.s2-research/shaurya/sw/miniconda3/envs/scrapy/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index. """Entry point for launching an IPython kernel.
Author profile (Economics)
"He is ranked among the top 100 economists[1] in the world according to IDEAS/RePEc, and is by far the most prolific economist" - Wikipedia
# Draw a reproducible 100,000-row sample, then keep only the Economics authors.
# The filter mask is built from the sample itself: indexing the sample with a mask computed
# on the full `df_mentors` has a mismatched index and makes pandas emit
# "UserWarning: Boolean Series key will be reindexed to match DataFrame index".
economics_sample = df_mentors.sample(n=100000, random_state=2)
economics_sample = economics_sample[economics_sample['fos'] == 'Economics']

# Mentorship score vs. log2 mentee count for the sampled Economics authors; the author id
# is included in the hover tooltip so individual points can be looked up.
fig = px.scatter(
    economics_sample,
    x="mentorship_score",
    y="log_mentee_count",
    color="fos",
    hover_data=['authors_ai2_id'],
)
fig.show()
/net/nfs2.s2-research/shaurya/sw/miniconda3/envs/scrapy/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
Author profile (Computer Science - Electrical Engineering)
"He has supervised 91 Ph.D. dissertations and co-authored two books on Low Power CMOS VLSI Design (John Wiley & McGraw Hill)" - Wikipedia
Let's look at the correlation between other author metrics and the mentorship score.
# Columns whose pairwise correlations are shown in the heatmap.
corr_columns = [
    'mentorship_score',
    'mentorship_score_mean',
    'mentee_count',
    'h_index',
    'paper_count',
    'citation_count',
]
# DataFrame.corr() with its default settings gives the pairwise correlation matrix.
corr_matrix = df_mentors[corr_columns].corr()

plt.figure(figsize=(10, 8))
# Pin the color scale to [-1, 1] and print each coefficient inside its cell.
sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True)
<AxesSubplot:>
# One scatter panel per field of study, wrapped after every 5 panels:
# log2 paper count on the x axis, mentorship score on the y axis.
g = sns.FacetGrid(data=df_mentors, col="fos", col_wrap=5)
g.fig.suptitle('log_paper_count vs mentorship_score', fontsize=20)
g.map(sns.scatterplot, "log_paper_count", "mentorship_score")
<seaborn.axisgrid.FacetGrid at 0x7fe2e6bb2b10>
# Box plot of log2 mentee count, one box per field of study.

# Collect every author's log_mentee_count into one list per field of study.
df_mentee_count = (
    df_mentors.groupby("fos")["log_mentee_count"]
    .apply(list)
    .reset_index(name="mentee_count_list")
)
dict_count = df_mentee_count.set_index('fos')['mentee_count_list'].to_dict()

# Field names and their value lists, in matching order. Materialized as real lists
# (rather than dict views); the earlier zip-based unpacking was dead code that this
# assignment overwrote immediately, so it has been removed.
labels, data = list(dict_count.keys()), list(dict_count.values())

figure(figsize=(12, 10), dpi=80)
plt.boxplot(data)
# Boxes are drawn at positions 1..N by default, so the tick positions start at 1.
plt.xticks(range(1, len(labels) + 1), labels, rotation=45)
plt.ylabel('log mentee count')
plt.show()
# Box plot of log2 mentorship score, one box per field of study.

# Collect every author's log_mentorship_score into one list per field of study.
# NOTE: the variable name `df_mentee_count` is kept from the original cell even though
# it holds mentorship-score lists here.
df_mentee_count = (
    df_mentors.groupby("fos")["log_mentorship_score"]
    .apply(list)
    .reset_index(name="log_mentorship_score_list")
)
dict_count = df_mentee_count.set_index('fos')['log_mentorship_score_list'].to_dict()

# Field names and their value lists, in matching order. Materialized as real lists
# (rather than dict views); the earlier zip-based unpacking was dead code that this
# assignment overwrote immediately, so it has been removed.
labels, data = list(dict_count.keys()), list(dict_count.values())

figure(figsize=(12, 10), dpi=80)
plt.boxplot(data)
# Boxes are drawn at positions 1..N by default, so the tick positions start at 1.
plt.xticks(range(1, len(labels) + 1), labels, rotation=45)
plt.ylabel('log mentorship_score')
plt.show()
# Joint plot of mean mentorship score (x) against citation count (y):
# a scatter plot in the center with a histogram of each variable on the margins.
g = sns.JointGrid(x="mentorship_score_mean", y="citation_count", data=df_mentors)
g.plot(
    sns.scatterplot,
    sns.histplot,
    alpha=0.7,
    edgecolor=".2",
    linewidth=0.5,
)
<seaborn.axisgrid.JointGrid at 0x7fe30779bb50>